# -*- coding: utf-8 -*-
import re
from goose3 import Goose
from goose3.text import StopWordsChinese

# NOTE: goose3 is initialized per call in content() below, configured with
# Chinese stop-word segmentation (StopWordsChinese).



# Extract a list of items (links) from a page
from common.request import requestutil


def items(response, selector_pattern="a", text_pattern=None, fields=None):
    """Extract elements matching a CSS selector from a parsed response.

    Args:
        response: a response whose ``.html.find(selector)`` returns nodes
            with ``.text`` and ``.attrs`` (requests-html style).
        selector_pattern: CSS selector to search for (default ``"a"``).
        text_pattern: optional regex; nodes whose text does not match
            (via ``re.search``) are skipped.
        fields: attribute names to copy from each node; defaults to
            ``["href"]``. Missing attributes yield ``""``.

    Returns:
        A list of dicts, one per matching node, with one key per
        requested field plus a ``"text"`` key holding the node text.
    """
    # Sentinel instead of a mutable default argument, which would be
    # shared (and mutable) across calls.
    if fields is None:
        fields = ["href"]
    # Compile once outside the loop instead of re-scanning per node.
    matcher = re.compile(text_pattern) if text_pattern else None
    results = []
    for node in response.html.find(selector_pattern):
        if matcher and not matcher.search(node.text):
            continue
        record = {field: node.attrs.get(field, "") for field in fields}
        record["text"] = node.text
        results.append(record)
    return results


# Automatically extract the main article body text
def content(response, selector=None):
    """Extract the main article text of *response* with goose3.

    Uses Chinese stop-word segmentation for boilerplate removal.
    The *selector* parameter is currently unused; it is kept for
    signature compatibility with existing callers.

    Returns the cleaned article text with all newlines removed,
    i.e. one flat string.
    """
    extractor = Goose({'stopwords_class': StopWordsChinese})
    article = extractor.extract(raw_html=response.text)
    # Flatten the extracted body into a single line.
    return article.cleaned_text.replace("\n", "")


def first_node_text(root_node, selecter):
    """Return the full text of the first node matching *selecter*.

    Returns "" when nothing matches. On any error, returns a string
    starting with "Exception" followed by the error message (legacy
    error-as-value contract; callers treat the result as plain text).
    """
    try:
        matched = root_node.find(selecter)
        return matched[0].full_text if len(matched) else ""
    except Exception as e:
        return "Exception" + str(e)


def first_node_attr(node, selecter, attr):
    """Return attribute *attr* of the first node matching *selecter*.

    Returns "" when no node matches, or when the first match lacks the
    attribute (consistent with items(), which uses ``attrs.get``). On
    any other error, returns a string starting with "Exception"
    followed by the error message (legacy error-as-value contract).
    """
    try:
        nodes = node.find(selecter)
        if len(nodes) == 0:
            return ""
        # .get() instead of [] so a missing attribute yields "" rather
        # than a KeyError being converted to an "Exception'attr'" string.
        return nodes[0].attrs.get(attr, "")
    except Exception as e:
        return "Exception" + str(e)


if __name__ == "__main__":
    # Smoke test: fetch a chapter-index page from a mobile novel site and
    # list the links whose text looks like a chapter title ("第...章...").
    domain = "https://m.77nt.com"
    url = domain + "/99147/"
    response = requestutil.get(url)
    # Force UTF-8 so the Chinese link text decodes correctly.
    response.encoding = 'utf-8'
    # results = items(response, ".result> h3 > a")
    results = items(response, "a", text_pattern="^第.*章.*$")
    print(results)
    # Follow-up step (disabled): fetch each chapter link and print its
    # extracted body text via content().
    # for url in results:
    #     response = requestutil.get(url)
    #     response.encoding = 'utf-8'
    #     c = content(response)
    #     print("===========================")
    #     print(c)
